R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

Install Libraries

library(dplyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(scales)
library(boot)

Load and Transform Data

nba_data <- read.csv("nba_2022-23_all_stats_with_salary.csv")

full_statistics <- nba_data %>% select(Salary, GP:VORP)
head(full_statistics)
##     Salary GP GS   MP   FG  FGA   FG. X3P X3PA  X3P. X2P X2PA  X2P.  eFG.  FT
## 1 48070014 56 56 34.7 10.0 20.2 0.493 4.9 11.4 0.427 5.1  8.8 0.579 0.614 4.6
## 2 47345760 34  3 22.2  4.1  9.9 0.408 1.0  3.2 0.303 3.1  6.7 0.459 0.457 2.3
## 3 47080179 73 24 29.1  5.9 13.6 0.436 1.2  3.9 0.311 4.7  9.7 0.487 0.481 2.8
## 4 44474988 55 54 35.5 11.1 22.2 0.500 2.2  6.9 0.321 8.9 15.3 0.580 0.549 4.6
## 5 44119845 47 47 35.6 10.3 18.3 0.560 2.0  4.9 0.404 8.3 13.4 0.617 0.614 6.5
## 6 43279250 50 50 33.5  8.9 17.6 0.506 1.6  4.4 0.365 7.3 13.2 0.552 0.551 3.8
##   FTA   FT. ORB DRB TRB AST STL BLK TOV  PF  PTS Total.Minutes  PER   TS. X3PAr
## 1 5.0 0.915 0.7 5.4 6.1 6.3 0.9 0.4 3.2 2.1 29.4          1941 24.1 0.656 0.564
## 2 3.3 0.681 0.4 2.3 2.7 5.2 0.8 0.4 2.4 1.7 11.4           755 13.6 0.498 0.322
## 3 4.3 0.656 1.2 4.6 5.8 7.5 1.0 0.5 3.5 2.2 15.9          2126 16.1 0.513 0.289
## 4 5.9 0.768 1.2 7.1 8.3 6.8 0.9 0.6 3.2 1.6 28.9          1954 23.9 0.583 0.309
## 5 7.1 0.919 0.4 6.3 6.7 5.0 0.7 1.4 3.3 2.1 29.1          1672 25.9 0.677 0.267
## 6 4.6 0.842 0.8 3.1 3.9 5.4 0.9 0.7 2.9 2.1 23.2          1673 19.7 0.593 0.249
##     FTr ORB. DRB. TRB. AST. STL. BLK. TOV. USG.  OWS DWS  WS WS.48 OBPM DBPM
## 1 0.248  2.3 16.8  9.7 30.0  1.3  0.9 12.5 31.0  5.8 2.0 7.8 0.192  7.5  0.1
## 2 0.334  2.1 11.4  6.8 35.3  1.8  1.4 17.1 27.0 -0.4 0.7 0.3 0.020 -0.8 -0.4
## 3 0.317  4.7 16.5 10.8 38.6  1.7  1.3 18.4 27.7 -0.6 2.6 1.9 0.044  0.3 -0.1
## 4 0.268  3.7 20.8 12.5 33.5  1.2  1.4 11.6 33.3  3.2 2.4 5.6 0.138  5.5  0.6
## 5 0.387  1.2 19.5 10.5 24.5  1.0  3.4 13.4 30.7  4.7 2.1 6.8 0.194  6.0  1.2
## 6 0.260  2.8  9.9  6.5 26.6  1.3  1.7 12.9 29.2  2.2 1.2 3.4 0.099  2.9 -1.2
##    BPM VORP
## 1  7.5  4.7
## 2 -1.2  0.1
## 3  0.2  1.2
## 4  6.1  4.0
## 5  7.1  3.9
## 6  1.8  1.6
basic_statistics <- nba_data %>% select(Salary, GP:X2P., FT:Total.Minutes)
head(basic_statistics)
##     Salary GP GS   MP   FG  FGA   FG. X3P X3PA  X3P. X2P X2PA  X2P.  FT FTA
## 1 48070014 56 56 34.7 10.0 20.2 0.493 4.9 11.4 0.427 5.1  8.8 0.579 4.6 5.0
## 2 47345760 34  3 22.2  4.1  9.9 0.408 1.0  3.2 0.303 3.1  6.7 0.459 2.3 3.3
## 3 47080179 73 24 29.1  5.9 13.6 0.436 1.2  3.9 0.311 4.7  9.7 0.487 2.8 4.3
## 4 44474988 55 54 35.5 11.1 22.2 0.500 2.2  6.9 0.321 8.9 15.3 0.580 4.6 5.9
## 5 44119845 47 47 35.6 10.3 18.3 0.560 2.0  4.9 0.404 8.3 13.4 0.617 6.5 7.1
## 6 43279250 50 50 33.5  8.9 17.6 0.506 1.6  4.4 0.365 7.3 13.2 0.552 3.8 4.6
##     FT. ORB DRB TRB AST STL BLK TOV  PF  PTS Total.Minutes
## 1 0.915 0.7 5.4 6.1 6.3 0.9 0.4 3.2 2.1 29.4          1941
## 2 0.681 0.4 2.3 2.7 5.2 0.8 0.4 2.4 1.7 11.4           755
## 3 0.656 1.2 4.6 5.8 7.5 1.0 0.5 3.5 2.2 15.9          2126
## 4 0.768 1.2 7.1 8.3 6.8 0.9 0.6 3.2 1.6 28.9          1954
## 5 0.919 0.4 6.3 6.7 5.0 0.7 1.4 3.3 2.1 29.1          1672
## 6 0.842 0.8 3.1 3.9 5.4 0.9 0.7 2.9 2.1 23.2          1673
#Used this link: https://faq.stathead.com/knowledge/basketball-advanced-stats to determine the advanced statistics
advanced_statistics <- nba_data %>% select(Salary, eFG., PER:VORP)
head(advanced_statistics)
##     Salary  eFG.  PER   TS. X3PAr   FTr ORB. DRB. TRB. AST. STL. BLK. TOV. USG.
## 1 48070014 0.614 24.1 0.656 0.564 0.248  2.3 16.8  9.7 30.0  1.3  0.9 12.5 31.0
## 2 47345760 0.457 13.6 0.498 0.322 0.334  2.1 11.4  6.8 35.3  1.8  1.4 17.1 27.0
## 3 47080179 0.481 16.1 0.513 0.289 0.317  4.7 16.5 10.8 38.6  1.7  1.3 18.4 27.7
## 4 44474988 0.549 23.9 0.583 0.309 0.268  3.7 20.8 12.5 33.5  1.2  1.4 11.6 33.3
## 5 44119845 0.614 25.9 0.677 0.267 0.387  1.2 19.5 10.5 24.5  1.0  3.4 13.4 30.7
## 6 43279250 0.551 19.7 0.593 0.249 0.260  2.8  9.9  6.5 26.6  1.3  1.7 12.9 29.2
##    OWS DWS  WS WS.48 OBPM DBPM  BPM VORP
## 1  5.8 2.0 7.8 0.192  7.5  0.1  7.5  4.7
## 2 -0.4 0.7 0.3 0.020 -0.8 -0.4 -1.2  0.1
## 3 -0.6 2.6 1.9 0.044  0.3 -0.1  0.2  1.2
## 4  3.2 2.4 5.6 0.138  5.5  0.6  6.1  4.0
## 5  4.7 2.1 6.8 0.194  6.0  1.2  7.1  3.9
## 6  2.2 1.2 3.4 0.099  2.9 -1.2  1.8  1.6

Plot Salaries by Position

nba_data$Position <- factor(nba_data$Position, levels = c("PG", "PG-SG", "SG", "SG-PG", "SF", "SF-SG", "SF-PF", "PF", "C"))

ggplot(nba_data, aes(x = Position, y = Salary)) +
  geom_boxplot(outlier.shape = 1) +
  scale_y_continuous(labels = label_comma()) +
  labs(title = "Salaries by Position", x = "Position", y = "Salary") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Perform ANOVA Test

#If the p-value is less than the significance level (0.05), conclude that there is a statistically significant difference in salaries among the different positions
anova_model <- aov(Salary ~ Position, data = nba_data)
summary(anova_model)
##              Df    Sum Sq   Mean Sq F value Pr(>F)  
## Position      8 1.794e+15 2.243e+14   1.989 0.0462 *
## Residuals   458 5.164e+16 1.127e+14                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Perform Tukey’s Test

#Use the results to identify the positions with statistically significant differences in mean salaries
tukey_results <- TukeyHSD(anova_model)
tukey_results
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Salary ~ Position, data = nba_data)
## 
## $Position
##                    diff       lwr         upr     p adj
## PG-SG-PG      9878955.0 -13824077 33581986.87 0.9314209
## SG-PG        -4898272.3  -9771392   -25152.95 0.0476956
## SG-PG-PG      5071233.5 -18631798 28774265.37 0.9991405
## SF-PG        -3447320.6  -8571677  1677035.46 0.4768417
## SF-SG-PG       660189.5 -23042842 24363221.37 1.0000000
## SF-PF-PG     -8579573.5 -41887888 24728740.78 0.9967653
## PF-PG        -2694528.1  -7886710  2497654.14 0.7949870
## C-PG         -4296851.4  -9421207   827504.68 0.1838278
## SG-PG-SG    -14777227.3 -38380908  8826453.19 0.5783584
## SG-PG-PG-SG  -4807721.5 -37901832 28286389.15 0.9999528
## SF-PG-SG    -13326275.6 -36983103 10330551.45 0.7113125
## SF-SG-PG-SG  -9218765.5 -42312876 23875345.15 0.9944478
## SF-PF-PG-SG -18458528.5 -58990371 22073313.80 0.8900961
## PF-PG-SG    -12573483.2 -36245095 11098128.50 0.7731829
## C-PG-SG     -14175806.4 -37832633  9481020.67 0.6364911
## SG-PG-SG      9969505.8 -13634175 33573186.33 0.9262137
## SF-SG         1450951.7  -3192220  6094123.03 0.9880391
## SF-SG-SG      5558461.8 -18045219 29162142.33 0.9982826
## SF-PF-SG     -3681301.2 -36918988 29556385.46 0.9999942
## PF-SG         2203744.2  -2514176  6921664.44 0.8750890
## C-SG           601420.9  -4041750  5244592.25 0.9999805
## SF-SG-PG     -8518554.1 -32175381 15138272.95 0.9707026
## SF-SG-SG-PG  -4411044.0 -37505155 28683066.65 0.9999757
## SF-PF-SG-PG -13650807.0 -54182649 26881035.30 0.9806514
## PF-SG-PG     -7765761.7 -31437373 15905850.00 0.9836239
## C-SG-PG      -9368084.9 -33024912 14288742.17 0.9485794
## SF-SG-SF      4107510.1 -19549317 27764337.17 0.9998172
## SF-PF-SF     -5132252.9 -38407702 28143196.71 0.9999255
## PF-SF          752792.5  -4224205  5729790.00 0.9999358
## C-SF          -849530.8  -5755728  4056666.60 0.9998209
## SF-PF-SF-SG  -9239763.0 -49771605 31292079.30 0.9986393
## PF-SF-SG     -3354717.7 -27026329 20316894.00 0.9999610
## C-SF-SG      -4957040.9 -28613868 18699786.17 0.9992617
## PF-SF-PF      5885045.3 -27400917 39171007.53 0.9997905
## C-SF-PF       4282722.1 -28992727 37558171.71 0.9999814
## C-PF         -1602323.2  -6579321  3374674.30 0.9854873

Plot Results of Tukey’s Test

tukey_data <- as.data.frame(TukeyHSD(anova_model)$Position)
tukey_data$Comparison <- row.names(tukey_data)

ggplot(tukey_data, aes(x = Comparison, y = diff)) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), width = 0.2) +
  geom_point() +
  coord_flip() +
  labs(x = "Difference in Mean Levels of Position", y = "", title = "95% Family-Wise Confidence Level") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(size = 6, hjust = 1, angle = 45),
    axis.text.y = element_text(size = 6, hjust = 1)
  ) +
  scale_y_continuous(labels = label_comma())

Generate Correlation Heatmap for Basic Statistics

basic_correlation_matrix <- cor(basic_statistics, use = "complete.obs")
melted_basic_correlation_matrix <- melt(basic_correlation_matrix)

ggplot(data = melted_basic_correlation_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  #Specified HEX color values to match NBA logo color scheme
  scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(size = 6, hjust = 1, vjust = 1, angle = 45),
    axis.text.y = element_text(size = 6),
    legend.key.size = unit(0.5, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  ) +
  labs(title = "Correlation Heatmap of Basic Statistics and Salaries", x = "", y = "") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 2)

Generate Correlation Heatmap for Advanced Statistics

advanced_correlation_matrix <- cor(advanced_statistics, use = "complete.obs")
melted_advanced_correlation_matrix <- melt(advanced_correlation_matrix)

ggplot(data = melted_advanced_correlation_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  #Specified HEX color values to match NBA logo color scheme
  scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(size = 6, hjust = 1, vjust = 1, angle = 45),
    axis.text.y = element_text(size = 6),
    legend.key.size = unit(0.5, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  ) +
  labs(title = "Correlation Heatmap of Advanced Statistics and Salaries", x = "", y = "") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 2)

Generate Correlation Heatmap for Full Statistics

full_correlation_matrix <- cor(full_statistics, use = "complete.obs")
melted_full_correlation_matrix <- melt(full_correlation_matrix)

ggplot(data = melted_full_correlation_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  #Specified HEX color values to match NBA logo color scheme
  scale_fill_gradient2(low = "#da1a33", mid = "#ffffff", high = "#00428c",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(size = 4, hjust = 1, vjust = 1, angle = 45),
    axis.text.y = element_text(size = 4),
    legend.key.size = unit(0.5, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  ) +
  labs(title = "Correlation Heatmap of Full Statistics and Salaries", x = "", y = "") +
  geom_text(aes(label = sprintf("%.2f", value)), size = 1)

View Correlations Between Full Statistics and Salaries (in Descending Order)

salary_correlations <- full_correlation_matrix["Salary", ]
sorted_salary_correlations <- sort(salary_correlations, decreasing = TRUE)
sorted_salary_correlations
##        Salary           PTS            FG           FGA          VORP 
##  1.0000000000  0.7191173336  0.7118509280  0.6966823647  0.6768492637 
##          X2PA           X2P            FT           FTA           TOV 
##  0.6715820234  0.6675550693  0.6645510390  0.6595895988  0.6431081241 
##            MP            WS          OBPM           AST            GS 
##  0.6417385312  0.6162372741  0.6000296867  0.5891035926  0.5889257613 
##           OWS           DRB           BPM          USG.           DWS 
##  0.5765526833  0.5677576398  0.5577456362  0.5565113689  0.5498690356 
## Total.Minutes           PER           TRB          X3PA           X3P 
##  0.5463765746  0.5373134923  0.4925254786  0.4739628546  0.4621655033 
##           STL          AST.            PF         WS.48            GP 
##  0.4616304850  0.4547321061  0.3861162032  0.3448925913  0.3009336031 
##           BLK           ORB           TS.           FTr           FT. 
##  0.2908862283  0.2002398018  0.1920965177  0.1750457743  0.1565122461 
##           FG.          eFG.          DRB.          DBPM          X3P. 
##  0.1294987211  0.1204278512  0.1147237989  0.1121376540  0.0667153966 
##          X2P.          TRB.          STL.          BLK.          TOV. 
##  0.0439189198  0.0207517590 -0.0003385747 -0.0465065932 -0.0551807178 
##         X3PAr          ORB. 
## -0.1173386874 -0.1368441671

Fit Multiple Regression Model

linear_model <- lm(Salary ~ PTS + FG + VORP, data = nba_data)
summary(linear_model)
## 
## Call:
## lm(formula = Salary ~ PTS + FG + VORP, data = nba_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -23837506  -3191890   -603193   2088031  38410717 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -148496     615409  -0.241   0.8094    
## PTS           701183     381109   1.840   0.0664 .  
## FG            198124    1046821   0.189   0.8500    
## VORP         2777388     427849   6.492 2.19e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7055000 on 463 degrees of freedom
## Multiple R-squared:  0.5688, Adjusted R-squared:  0.566 
## F-statistic: 203.6 on 3 and 463 DF,  p-value: < 2.2e-16
#Referenced this link: https://www.statology.org/r-vs-r-squared/ to approximate the model correlation
model_correlation <- sqrt(summary(linear_model)$r.squared)
model_correlation
## [1] 0.754166

Plot Multiple Regression Model Diagnostics (for Analysis of Outliers or High Leverage Observations)

par(mfrow=c(2,2))
plot(linear_model)

Plot Multiple Regression Model (Actual vs. Predicted Values)

nba_data$predicted_salary <- predict(linear_model, newdata = nba_data)

ggplot(nba_data, aes(x = Salary, y = predicted_salary)) +
  #Specified HEX color values to match NBA logo color scheme
  geom_point(alpha = 0.5, color = "#00428c") +
  geom_abline(intercept = 0, slope = 1, color = "#da1a33") +
  scale_x_continuous(labels = label_comma()) +
  scale_y_continuous(labels = label_comma()) +
  labs(x = "Actual Salaries", y = "Predicted Salaries", title = "Actual vs. Predicted Salaries") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    axis.title.x = element_text(size = 10, margin = margin(t = 10, unit = "pt")),
    axis.title.y = element_text(size = 10, margin = margin(r = 10, unit = "pt")),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
    )

Perform Bootstrapping on Multiple Regression Model

#Referenced this link: https://www.statmethods.net/advstats/bootstrapping.html for information on bootstrapping
bs <- function(formula, data, indices) {
  d <- data[indices, ]
  model <- lm(formula, data = d)
  return(coef(model))
}

set.seed(123)
bootstrap_results <- boot(data = nba_data, statistic = bs, R = 1000, formula = Salary ~ PTS + FG + VORP)
bootstrap_results
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = nba_data, statistic = bs, R = 1000, formula = Salary ~ 
##     PTS + FG + VORP)
## 
## 
## Bootstrap Statistics :
##      original    bias    std. error
## t1* -148496.1  19296.47    506516.6
## t2*  701183.3  20536.56    435911.6
## t3*  198124.3 -64345.66   1204520.8
## t4* 2777387.9  16439.01    503068.9

Derive Confidence Intervals from Bootstrap Results

boot.ci(bootstrap_results, type = "bca", index = 1) #Intercept
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 1)
## 
## Intervals : 
## Level       BCa          
## 95%   (-1145464,   823691 )  
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 2) #PTS
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 2)
## 
## Intervals : 
## Level       BCa          
## 95%   (-206913, 1555303 )  
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 3) #FG
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 3)
## 
## Intervals : 
## Level       BCa          
## 95%   (-2090459,  2746448 )  
## Calculations and Intervals on Original Scale
boot.ci(bootstrap_results, type = "bca", index = 4) #VORP
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bootstrap_results, type = "bca", index = 4)
## 
## Intervals : 
## Level       BCa          
## 95%   (1798436, 3715748 )  
## Calculations and Intervals on Original Scale

Plot Bootstrap Results

plot(bootstrap_results, index = 1) #Intercept

plot(bootstrap_results, index = 2) #PTS

plot(bootstrap_results, index = 3) #FG

plot(bootstrap_results, index = 4) #VORP